1 Introduction

In this document, we try to understand thresholded cricket batting scores, i.e. the runs a batsman scores beyond a given threshold.

The hypothesis is that once a cricketer has scored more than a certain number of runs in an innings, he is likely to go on and score more (measured after subtracting that initial score).

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

2 Loading data

# Load the per-innings batting records from the CSV in the working directory.
# Column types are guessed by readr (see the parse message printed below).
clean_test_batting_inngings <- readr::read_csv(
  file = "./clean_test_batting_inngings.csv"
)
## Parsed with column specification:
## cols(
##   country = col_character(),
##   player_name = col_character(),
##   runs = col_character(),
##   mins = col_double(),
##   balls_faced = col_double(),
##   fours = col_double(),
##   sixes = col_double(),
##   strike_rate = col_double(),
##   pos = col_double(),
##   dismissal = col_character(),
##   inns = col_double(),
##   opposition = col_character(),
##   ground = col_character(),
##   start_date = col_character(),
##   test_number = col_character()
## )

3 Data cleaning

# Derive dismissal flags and a numeric score from the raw `runs` strings.
inns_data <- dplyr::mutate(
  clean_test_batting_inngings,
  # An innings is flagged as not out when `runs` contains "*" or "DNB".
  # NOTE(review): any other non-numeric entry is therefore counted as a
  # dismissal for 0 runs - confirm that no such values occur in the data.
  not_out = stringr::str_detect(runs, "[*]") |
    stringr::str_detect(runs, "DNB"),
  out = !not_out,
  # Extract the first numeric token and convert it to an integer; entries
  # with no digits (where the extraction yields NA) are set to 0.
  runs_num = coalesce(
    as.integer(stringr::str_extract(runs, "\\-*\\d+\\.*\\d*")),
    0L
  )
)

# thres_data %>% 
#   group_by(player_name) %>% 
#   summarise(career_avg = sum(runs_num)/sum(out))

3.1 Threshold at 30 check

# Per-player average of the runs scored beyond 30: every innings contributes
# max(runs_num - 30, 0), and the total is divided by the number of dismissals.
thres_data_30 <- inns_data %>%
  dplyr::mutate(runs_num_30 = pmax(runs_num - 30L, 0)) %>%
  dplyr::group_by(player_name) %>%
  dplyr::summarise(career_avg = sum(runs_num_30) / sum(out))

4 Keeping only the top batsmen by career batting average (top 20 among players with at least 20 innings)

# One row per player:
#   career_avg - total runs divided by the number of dismissals
#   inns       - number of rows for the player, whatever their dismissal status
career_data <- dplyr::summarise(
  dplyr::group_by(inns_data, player_name),
  career_avg = sum(runs_num) / sum(out),
  inns = n()
)

# Restrict to players with at least 20 recorded innings, then keep the rows
# with the 20 highest career averages (top_n retains ties).
# NOTE(review): the object name says "top100" but n is 20 - confirm intent.
career_data_top100 <- dplyr::top_n(
  dplyr::filter(career_data, inns >= 20L),
  n = 20,
  wt = career_avg
)

4.1 Thresholding at different numbers of runs

# Keep only the innings played by the batsmen selected in career_data_top100.
inns_data_top100 <- dplyr::filter(
  inns_data,
  player_name %in% career_data_top100$player_name
)


# Thresholded career average per player.
#
# Arguments:
#   data  - innings-level data frame with columns `player_name`, `runs_num`
#           and `out` (`out` is summed to count dismissals).
#   thres - number of runs subtracted from every innings score.
#
# Returns one row per player with columns `player_name` and
# `career_avg_thres`: the sum of max(runs_num - thres, 0) over the player's
# innings divided by the player's total number of dismissals.
#
# NOTE(review): the denominator counts every dismissal, including those in
# innings that never reached `thres` - confirm this is the intended measure.
thres_career_avg <- function(data, thres) {
  data %>%
    # Runs beyond the threshold, floored at zero.
    dplyr::mutate(runs_num_thres = pmax(runs_num - thres, 0)) %>%
    dplyr::group_by(player_name) %>%
    dplyr::summarise(career_avg_thres = sum(runs_num_thres) / sum(out))
}

# thres_career_avg(data = inns_data_top100, thres = 30L)

# Compute the thresholded averages for every cutoff from 0 to 30 runs;
# element i of the resulting list corresponds to thres[i].
thres <- 0:30
list_thres_career_avg <- purrr::map(
  thres,
  function(cutoff) thres_career_avg(data = inns_data_top100, thres = cutoff)
)

4.2 Visualisation

# Put the per-threshold averages side by side: one row per player, one column
# per threshold. The columns are bound by position, so this relies on every
# element of list_thres_career_avg listing the players in the same order.
mat_thres_career_avg <- do.call(
  cbind,
  purrr::map(list_thres_career_avg, "career_avg_thres")
)

# Convert to a tibble and label the columns with the zero-padded thresholds
# ("00", "01", ...).
df_thres_career_avg <- tibble::as_tibble(data.frame(mat_thres_career_avg))
colnames(df_thres_career_avg) <- sprintf("%02d", thres)

# Player names are taken from the first list element.
df_thres_career_avg <- dplyr::mutate(
  df_thres_career_avg,
  player_name = list_thres_career_avg[[1]]$player_name
)

# Reshape to long format (one row per player and threshold) and rank the
# players within each threshold. rank() gives 1 to the smallest value, so a
# larger rank means a higher thresholded average. The result stays grouped by
# `thres`.
df_thres_career_avg_long <- df_thres_career_avg %>%
  tidyr::gather(key = thres, value = career_avg_thres, -player_name) %>%
  dplyr::group_by(thres) %>%
  dplyr::mutate(career_avg_thres_rank = rank(career_avg_thres))


# Thresholded average against threshold, one coloured line per player.
# "DG Bradman" is excluded from this plot. `thres` holds character labels, so
# the lines need an explicit group to connect each player's points.
ggplot(
  data = dplyr::filter(df_thres_career_avg_long, player_name != "DG Bradman"),
  mapping = aes(x = thres, y = career_avg_thres, colour = player_name)
) +
  geom_point() +
  geom_line(mapping = aes(group = player_name)) +
  theme(legend.position = "none")

  # scale_colour_brewer(palette = "Set1")

# Within-threshold rank against threshold, one coloured line per player.
# All players are plotted here (no "DG Bradman" filter is applied).
ggplot(
  data = df_thres_career_avg_long,
  mapping = aes(x = thres, y = career_avg_thres_rank, colour = player_name)
) +
  geom_point() +
  geom_line(mapping = aes(group = player_name)) +
  theme(legend.position = "none")

  # scale_colour_brewer(palette = "Set1")

# Interactive version of the most recent ggplot (the rank plot above).
plotly::ggplotly(p = ggplot2::last_plot())

5 Session Info

# Record the R version, platform and package versions used for this report.
utils::sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] forcats_0.4.0   stringr_1.4.0   dplyr_0.8.3     purrr_0.3.2    
## [5] readr_1.3.1     tidyr_0.8.3     tibble_2.1.3    ggplot2_3.2.1  
## [9] tidyverse_1.2.1
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_0.2.5  xfun_0.9          haven_2.1.1      
##  [4] lattice_0.20-38   colorspace_1.4-1  generics_0.0.2   
##  [7] vctrs_0.2.0       htmltools_0.3.6   viridisLite_0.3.0
## [10] yaml_2.2.0        plotly_4.9.0      rlang_0.4.0      
## [13] later_0.8.0       pillar_1.4.2      glue_1.3.1       
## [16] withr_2.1.2       modelr_0.1.5      readxl_1.3.1     
## [19] munsell_0.5.0     gtable_0.3.0      cellranger_1.1.0 
## [22] rvest_0.3.4       htmlwidgets_1.3   evaluate_0.14    
## [25] labeling_0.3      knitr_1.24        httpuv_1.5.1     
## [28] crosstalk_1.0.0   Cairo_1.5-10      broom_0.5.2      
## [31] Rcpp_1.0.2        xtable_1.8-4      promises_1.0.1   
## [34] scales_1.0.0      backports_1.1.4   jsonlite_1.6     
## [37] mime_0.7          hms_0.5.1         digest_0.6.20    
## [40] stringi_1.4.3     shiny_1.3.2       grid_3.6.0       
## [43] cli_1.1.0         tools_3.6.0       magrittr_1.5     
## [46] lazyeval_0.2.2    crayon_1.3.4      pkgconfig_2.0.2  
## [49] zeallot_0.1.0     data.table_1.12.2 xml2_1.2.2       
## [52] lubridate_1.7.4   assertthat_0.2.1  rmarkdown_1.15   
## [55] httr_1.4.1        rstudioapi_0.10   R6_2.4.0         
## [58] nlme_3.1-141      compiler_3.6.0